###Preparing Data for Public Distribution
##Data was downloaded directly from Qualtrics
set.seed(1355234)
d <- read.csv("/Users/Allan/Dropbox/!!Papers/Methods/Science Deserves Better/Rep_Att_Survey/2013-08-16-Replication_in_Social_Science.csv")

# v.names <- d[1,]
# names(d) <- as.character(v.names)
# d[1:3,]

#See the first row for the variable definitions
d <- d[-1,]
#d <- d[d$V1!="ResponseID",]

names(d) <- c("ResponseID","ResponseSet","Name", "ExtDataRef", "Email", "IP", "Status", "StartDate", "EndDate", "Finished",
              "Null", "Null2", "Browser", "Browser.Version", "OS", "Resolution", "Flash", "Java", "Agent", 
              "R1.1", "R1.2", "R1.3", "R1.4", "R1.5", "R1.6", "R1.6.text", "R2.1", "R2.2", "R2.3", "R2.4", "R2.5",
              "R2.6", "R2.7", "R2.8", "R3", "Journal", "Year", "Author", "Other.Info", "Contact", "X")

#Removing my test answers
d <- d[d$Contact!="ALlan Dafoe" & d$Contact!="Allan test" & d$Contact!="test" & d$R1.6.text!="allan",]

#Removing Identifying and Unnecessary Variables
#I removed variables related to the authors of the papers that were replicated so as not to promote unsubstantiated anonymous criticisms.
d <- d[, c("Name", "StartDate", "EndDate", "Finished", "R1.1", "R1.2", "R1.3", "R1.4", "R1.5", "R1.6", "R1.6.text", "R2.1", "R2.2", "R2.3", "R2.4", "R2.5",
           "R2.6", "R2.7", "R2.8", "R3", "Journal", "Year")]
       
setwd("/Users/Allan/Dropbox/!!Papers/Methods/Science Deserves Better/Rep_Files/")
write.table(d,file="Output/rep_data2.csv",sep=",",row.names=F)

       
####Begin Public Replication Code####
#2013-08-16-Replication_in_Social_Science.csv" is not meant to be public, as it contains criticism of published scholarship that is not verified.
#The relevant file for replication purposes is "rep_data2.csv", produced using the above code. 
#Specifically rep_data2.csv does not contain identifying information for the papers the respondents tried to replicate, or for the respondents. 

setwd("/Users/Allan/Dropbox/!!Papers/Methods/Science Deserves Better/Rep_Files/")
d <- read.csv("Output/rep_data2.csv")
              
#Respondents
levels(d$Name) <- c("none", "Dafoe", "King", "none", "Polmeth")
  
#"Students who had taken my "Advanced Quantitative Methods" Ph.D. course in 2012 "
sum(d$Name=="Dafoe")

#"Students who had taken Gary King's "Advanced Quantitative Political Methodology" Ph.D. course"
sum(d$Name=="King")

#"Scholars responding to an email I distributed over the Political Methodology listserve "
sum(d$Name=="Polmeth")

#"responses to the survey"
sum(d$Name=="Polmeth" | d$Name=="King" | d$Name=="Dafoe")



###Cleaning data###
d$Year
d$Year[d$Year==".2007"] <- "2007"

r <- c("R1.1","R1.2", "R1.3", "R1.4", "R1.5", "R1.6")

#Filling in 0s
d[,r][is.na(d[,r])] <- 0
d[,r]

##Responses
responses.w <- c("(1) No replication materials were available online.", 
               "(2) A limited dataset was available to perform the analysis, but no analysis code.", "(3) A limited dataset was available to perform the analysis and some analysis code.",
               "(4) A limited dataset was available to perform the analysis and complete analysis code.", "(5) Many primary datasets were available, as well \n as code to create the final dataset from the \n primary datasets and code to analyze the data.",
               "(6) Other: please explain")
responses.w <- gsub("perform", "perform \n", responses.w)

tot <- sum(d$R1.1==1 | d$R1.2==1 | d$R1.3==1 | d$R1.4==1 | d$R1.5==1 | d$R1.6==1)


responses.n <- c(sum(d$R1.1==1)/tot, 
                 sum(d$R1.2==1)/tot, 
                 sum(d$R1.3==1)/tot, 
                 sum(d$R1.4==1)/tot, 
                 sum(d$R1.5==1)/tot, 
                 sum(d$R1.6==1)/tot)
height <- 6
pdf('Output/Fig2.pdf', width=1.6*height, height=height)
par(mar=c(5.1, 20, 4.1,2.1))
barplot(responses.n, names.arg=responses.w, horiz=TRUE, las=1  , xlab="Proportion of Responses", col="dark blue")
dev.off()
#main="Availability of Replication Files (n=192)"

#Responses for Dafoe class
res <- d$Name=="Dafoe"
tot <- sum(d$R1.1[res]==1 | d$R1.2[res]==1 | d$R1.3[res]==1 | d$R1.4[res]==1 | d$R1.5[res]==1 | d$R1.6[res]==1)

responses.n <- c(sum(d$R1.1[res]==1)/tot, 
                 sum(d$R1.2[res]==1)/tot, 
                 sum(d$R1.3[res]==1)/tot, 
                 sum(d$R1.4[res]==1)/tot, 
                 sum(d$R1.5[res]==1)/tot, 
                 sum(d$R1.6[res]==1)/tot)



height <- 6
pdf('Output/Fig2-c1.pdf', width=1.6*height, height=height)
par(mar=c(5.1, 20, 4.1,2.1))
barplot(responses.n, names.arg=responses.w, horiz=TRUE, las=1,  main="Availability of Replication Files (Dafoe's class, n=16)", xlab="Proportion of Responses", col="dark blue")
dev.off()


#Responses for King's class
res <- d$Name=="King"

tot <- sum(d$R1.1[res]==1 | d$R1.2[res]==1 | d$R1.3[res]==1 | d$R1.4[res]==1 | d$R1.5[res]==1 | d$R1.6[res]==1)

responses.n <- c(sum(d$R1.1[res]==1)/tot, 
                 sum(d$R1.2[res]==1)/tot, 
                 sum(d$R1.3[res]==1)/tot, 
                 sum(d$R1.4[res]==1)/tot, 
                 sum(d$R1.5[res]==1)/tot, 
                 sum(d$R1.6[res]==1)/tot)



height <- 6
pdf('Output/Fig1-c2.pdf', width=1.6*height, height=height)
par(mar=c(5.1, 20, 4.1,2.1))
barplot(responses.n, names.arg=responses.w, horiz=TRUE, las=1,  main="Availability of Replication Files (King's class, n=29)", xlab="Proportion of Responses", col="dark blue")
dev.off()


#Proportion who met standard of providing data and complete analysis code, ignoring Other
#"Ignoring the "Other" category, 36% of these respondents reporte that complete data files and replication code were available"
tot <- d$R1.1==1 | d$R1.2==1 | d$R1.3==1 | d$R1.4==1 | d$R1.5==1 
com <- (d$R1.4==1 | d$R1.5==1) & d$R1.1==0 & d$R1.2==0 & d$R1.3==0
not.com <- (d$R1.1==1 | d$R1.2==1 | d$R1.3==1) & d$R1.4==0 & d$R1.5==0

sum(com)/sum(tot)
#36%


#Proportion for which some aspect of replication files were missing
#"whereas 64% reported that some important element of the replication files were missing"
sum(not.com)/sum(tot)
#64%


#Second question
#Filling in 0s
r <- c("R2.1","R2.2", "R2.3", "R2.4", "R2.5", "R2.6", "R2.7", "R2.8")
d[,r][is.na(d[,r])] <- 0
d[,r]


responses.w2 <- c("(1) I was not able to \n approximately reproduce the main results.", 
                 "(2) I was able to \n approximately reproduce the main results.", "(3) I was able to precisely reproduce the main results.",
                 "(4) I found one or more major technical errors,\n though these didn't change the main results.", "(5) I found that one or more key results \n were driven by a technical error.",
                 "(6) I found that one or more key results \n were driven by an arbitrary aspect of their analysis.", 
                  "(7) I found that one or more key results were fragile \n in a manner that would lead an impartial scholar to \n substantially discount the value of the original study.",
                  "(8) Most or all of the key results were robust.")
  

res <- TRUE
tot <- d$R2.1[res]==1 | d$R2.2[res]==1 | d$R2.3[res]==1 | d$R2.4[res]==1 | d$R2.5[res]==1 | d$R2.6[res]==1 | d$R2.7[res]==1 | d$R2.8[res]==1
tot <- sum(tot)

responses.n2 <- c(sum(d$R2.1[res]==1)/tot, 
                 sum(d$R2.2[res]==1)/tot, 
                 sum(d$R2.3[res]==1)/tot, 
                 sum(d$R2.4[res]==1)/tot, 
                 sum(d$R2.5[res]==1)/tot, 
                 sum(d$R2.6[res]==1)/tot,
                 sum(d$R2.7[res]==1)/tot,
                 sum(d$R2.8[res]==1)/tot)


height <- 8
pdf('Output/Fig3.pdf', width=1.6*height, height=height)
par(mar=c(5.1, 22, 4.1,2.1))
barplot(responses.n2, names.arg=responses.w2, horiz=TRUE, las=1, xlab="Proportion of Responses", col="dark blue")
dev.off()
# main="`Please describe the results of your replication efforts (select all that apply).'",


#Proportion claiming to precisely replicate results. 
#"On the one hand, of those who responded to the reproducibility of the result (responses 1-3), 52% reported that they were “able to precisely reproduce the main results” and only 13% reported that they were “not able to approximately reproduce the main results.”"
tot <- sum(d$R2.1==1 | d$R2.2==1 | d$R2.3==1) 
sum(d$R2.3==1)/tot

sum(d$R2.1==1)/tot

#Proportion of respondents reporting robust, out of all reporting about robustness.
#"Of those who responded to the robustness of the results (responses 4-8), 36% reported that “most or all of the key results were robust”, 20% that there were “major technical errors though these didn’t change the main results”, and 56% that results were not robust (responses 5-7)."
sum(d$R2.8)/sum(d$R2.4==1 | d$R2.5==1 | d$R2.6==1 | d$R2.7==1 | d$R2.8)
sum(d$R2.4==1)/sum(d$R2.4==1 | d$R2.5==1 | d$R2.6==1 | d$R2.7==1 | d$R2.8)
sum(d$R2.5==1 | d$R2.6==1 | d$R2.7==1)/sum(d$R2.4==1 | d$R2.5==1 | d$R2.6==1 | d$R2.7==1 | d$R2.8)







